Status: Draft
Feb 2, 2021
When looking at disparities for COVID-19 cases between different race/ethnicity groups, we need to keep in mind that some states and counties have severe data completeness issues, as discussed in the case data completeness analysis.
#@title
import pandas as pd
import altair as alt
from vega_datasets import data
from google.colab import auth
auth.authenticate_user()
# Turn off the three-dot menu for Altair/Vega charts.
alt.renderers.set_embed_options(actions=False)
#%load_ext google.colab.data_table
#@title
# Calculations at https://docs.google.com/spreadsheets/d/11iIgUWzocX1lJrmsMjmfq8Uu9zo1iBsi9R2WQis-TCk/edit#gid=2066332861
prevalence = pd.DataFrame.from_dict({'group': [
'Hispanic/Latino',
'Black',
'White',
'Asian',
'Native Hawaiian/Pacific Islander',
'American Indian/Alaska Native',
'-Total-',
], 'percent': [
.025,
.023,
.021,
.012,
.034,
.038,
.041]
})
bars = alt.Chart(prevalence).mark_bar().encode(
x=alt.X('percent', axis=alt.Axis(format='.1%'), title=''),
y=alt.Y('group', sort='-x', title=''),
color=alt.Color('group',
scale=alt.Scale(scheme='category20'),
title=''),
tooltip=[
alt.Tooltip('group:N', title='Race/Ethnicity Group'),
alt.Tooltip('percent:Q', format='.2%', title='Prevalence within race/ethnicity group'),
]
).properties(
title='Percent of Race/Ethnicity Group in the U.S. who had COVID-19 based on Incomplete CDC Data as of %s' % 'Dec 16'
)
alt.concat(bars).properties(
title=alt.TitleParams(
['Source: U.S. Census Bureau\'s American Community Survey 2019 5-year estimates for population data.'],
baseline='bottom',
dy=20,
orient='bottom',
fontWeight='normal',
fontSize=11
)
).display()
#@title
CASES = 'Cases'
DATASET = 'cdc'
metric = CASES
project_id = 'msm-secure-data-1b'
cdc_table = '`%s.ndunlap_secure.cdc_restricted_access_20201231`' % project_id
date = 'DATE(2020, 12, 16)'
date_int = '20201216'
date_display_name = 'Dec 16'
# Chart settings.
total_cases_scale_max = 2000000
scatter_height = 350
scatter_width = 350
map_height = 350
map_width = 500
us_states = alt.topo_feature(data.us_10m.url, 'states')
us_counties = alt.topo_feature(data.us_10m.url+"#", 'counties')
territories = ('PR', 'GU', 'VI', 'MP', 'AS')
The CDC case data shows 4.09% of the U.S. population having had COVID-19, whereas the CRDT data shows 5.12% of the U.S. population having had COVID-19 up to Dec 16 (based on the CDC data only having 80% of the total cases in the CRDT data). Note that the Total group is larger than all of the other groups because it also includes the 45% of cases in the data that didn't have known race/ethnicity.
#@title
crdt_query = ('''
SELECT
State as state,
Cases_Total as crdt_cases,
Cases_Total - Cases_Unknown as crdt_known_race_cases,
ROUND(1 - Cases_Unknown / Cases_Total, 4) as crdt_known_race_cases_percent,
FROM `msm-secure-data-1b.ndunlap_secure.crdt`
WHERE
date = %s
''' % date_int)
nyt_states_query = ('''
SELECT
state_name,
state_fips_code,
confirmed_cases as nyt_cases,
deaths as nyt_deaths
FROM `bigquery-public-data.covid19_nyt.us_states`
WHERE
date = %s AND
state_fips_code IS NOT NULL
''' % date)
nyt_counties_query = ('''
SELECT
county_fips_code,
confirmed_cases as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
date = %s AND
county_fips_code IS NOT NULL
''' % date)
cdc_states_query = ('''
SELECT
res_state,
COUNT(*) as cdc_cases
FROM
%s
GROUP BY
res_state
''' % cdc_table)
cdc_counties_query = ('''
SELECT
res_state,
res_county,
race_ethnicity_combined,
COUNT(*) as cases
FROM
%s
GROUP BY
res_county,
res_state,
race_ethnicity_combined
''' % cdc_table)
#@title
# CDC vs. NYT county
df = pd.io.gbq.read_gbq(cdc_counties_query, project_id=project_id)
for territory in territories:
df = df[df.res_state != territory]
project_id = 'msm-secure-data-1b'
df_county_fips_map = pd.io.gbq.read_gbq(f'''
SELECT
*
FROM
`msm-secure-data-1b.ndunlap_secure.county_fips_mapping`
''', project_id=project_id)
df_county_fips_map.crew_county = df_county_fips_map.crew_county.str.lower()
df_county_fips_map['state_county'] = df_county_fips_map.state + '-' + df_county_fips_map.crew_county
df_county_fips_map['state_county'] = df_county_fips_map.state_county.astype('string').str.strip()
df_county_fips_map.set_index('state_county', inplace=True)
#@title
# Concatenate the state and county names because county names are not unique across states.
df.res_county = df.res_county.str.lower()
df['state_county'] = df.res_state + '-' + df.res_county
df['state_county'] = df.state_county.astype('string').str.strip()
df.set_index('state_county', inplace=True)
df['race_ethnicity_combined'] = df.race_ethnicity_combined.astype('string').str.strip()
race_ethnicity_combined_map = {
'Asian, Non-Hispanic': 'asian_cases',
'Black, Non-Hispanic': 'black_cases',
'White, Non-Hispanic': 'white_cases',
'American Indian/Alaska Native, Non-Hispanic': 'aian_cases',
'Hispanic/Latino': 'hispanic_cases',
'Multiple/Other, Non-Hispanic': 'other_cases',
'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'nhpi_cases',
'Missing': 'unknown_cases',
'Unknown': 'unknown_cases',
'NA': 'na_cases',
}
df = df.replace(to_replace={'race_ethnicity_combined': race_ethnicity_combined_map})
#@title
merged_df = df.join(df_county_fips_map, on="state_county", how='inner', lsuffix='_left', rsuffix='_right')
# Create a crosstab table with rows = counties, columns = race_ethnicity_combined.
crosstab_df = pd.crosstab(merged_df['county_fips'], merged_df.race_ethnicity_combined, values=merged_df.cases, aggfunc=sum,
margins=True,
margins_name='total_cases'
)
# Have to reset_index() to go from pandas multi-index to single index.
crosstab_df = crosstab_df.reset_index()
crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
crosstab_df['county_fips'] = crosstab_df.county_fips.astype(int)
crosstab_df['total_known_cases'] = crosstab_df['total_cases'] - crosstab_df.unknown_cases.fillna(0)
crosstab_df['total_known_cases'] = crosstab_df['total_cases'] - crosstab_df.na_cases.fillna(0) - crosstab_df.unknown_cases.fillna(0)
#@title
df_acs_name_lookup = pd.io.gbq.read_gbq(f'''
SELECT
*
FROM
`msm-internal-data.ipums_acs.acs_2019_5year_county`
''', project_id=project_id)
df_acs_name_lookup['state_county'] = df_acs_name_lookup.county.astype('string').str.strip() + ', ' + df_acs_name_lookup.state.astype('string').str.strip()
df_acs_name_lookup.drop(columns=['state', 'county'], inplace=True)
df_acs_name_lookup.set_index('county_fips', inplace=True)
county_chart_df = crosstab_df.join(df_acs_name_lookup, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
county_chart_df.county_fips = county_chart_df.county_fips.astype(int)
#@title
chart_df = county_chart_df.copy(deep=True)
chart_df.reset_index(inplace=True)
chart_df.county_fips = chart_df.county_fips.astype(int)
chart_df['percent_known_cases'] = round(chart_df.total_known_cases / chart_df.total_cases, 2)
chart_df['total_known_or_na_cases'] = chart_df.total_known_cases + chart_df.na_cases
chart_df['percent_known_or_na_cases'] = round(chart_df.total_known_or_na_cases / chart_df.total_cases, 2)
#@title
race_ethnicity_groups = ['black', 'hispanic', 'aian', 'nhpi', 'asian', 'white', 'other']
def GenerateColNames(group):
cases_col = group + '_cases'
pop_col = group + '_pop'
pop_percent_col = group + '_percent'
cases_percent_col = group + '_cases_percent'
cases_percent_with_unknown_col = group + '_cases_percent_with_unknown'
cases_per_100_col = group + '_cases_per_100'
cases_to_pop_col= group + '_cases_to_pop'
cases_to_pop_with_unknown_col= group + '_cases_to_pop_with_unknown'
return {'cases': cases_col,
'pop': pop_col,
'pop_percent': pop_percent_col,
'cases_per_100': cases_per_100_col,
'cases_percent': cases_percent_col,
'cases_percent_with_unknown': cases_percent_with_unknown_col,
'cases_to_pop': cases_to_pop_col,
'cases_to_pop_with_unknown': cases_to_pop_with_unknown_col,
}
group_names = {}
for group in race_ethnicity_groups:
group_names[group] = GenerateColNames(group)
for group in race_ethnicity_groups:
chart_df[group_names[group]['cases_per_100']] = round(chart_df[group_names[group]['cases']] / chart_df[group_names[group]['pop']], 4)
chart_df[group_names[group]['cases_percent']] = round(chart_df[group_names[group]['cases']] / chart_df.total_known_cases, 2)
chart_df[group_names[group]['cases_percent_with_unknown']] = round(chart_df[group_names[group]['cases']] / chart_df.total_cases, 2)
chart_df[group_names[group]['cases_to_pop']] = round(
chart_df[group_names[group]['cases_percent']] / chart_df[group_names[group]['pop_percent']], 2)
chart_df[group_names[group]['cases_to_pop_with_unknown']] = round(
chart_df[group_names[group]['cases_percent_with_unknown']] / chart_df[group_names[group]['pop_percent']], 2)
chart_df.reset_index(inplace=True)
#@title
filter_data = False
#MIN_POP_PERCENT = 0.001
MIN_POP = 100
MIN_PERCENT_KNOWN = 0.5
MIN_CASES = 5
group_to_display_name = {
'black': 'Black',
'white': 'White',
'hispanic': 'Hispanic/Latino',
'asian': 'Asian',
'nhpi': 'Native Hawaiian/Pacific Islander',
'aian': 'American Indian/Alaska Native',
'other': 'Other or multiple race/ethnicity',
'total': 'Total'
}
group_to_short_name = {
'black': 'Black',
'white': 'White',
'hispanic': 'Hispanic',
'asian': 'Asian',
'nhpi': 'NHPI',
'aian': 'AIAN',
'other': 'Other',
'total': 'Total'
}
chart_col_to_color_scheme = {
'cases_per_100': 'yelloworangebrown',
'cases_to_pop': 'blueorange',
'percent_known_cases': 'redyellowblue',
'percent_known_or_na_cases': 'redyellowblue',
}
chart_col_to_legend_format = {
'cases_per_100': '.0%',
'cases_to_pop': '.1f',
'percent_known_cases': '.0%',
'percent_known_or_na_cases': '.0%',
}
def GenerateCountyMap(chart_df, chart_col, group, group_names, metric, date):
group_chart_col = chart_col
if group:
group_chart_col = group_names[group][chart_col]
group_display_name = ''
if group:
group_display_name = group_to_short_name[group]
group_short_name = group_to_short_name[group]
chart_col_to_range = {
'cases_per_100': [0, .2],
'cases_to_pop': [0, 2],
'percent_known_cases': [0, 1],
'percent_known_or_na_cases': [0, 1],
}
prevalence_text = 'who had COVID-19'
col_to_title = {
'total_cases': group_display_name + ' ' + metric + ' as of ' + date,
'cases_per_100': 'Percent of ' + group_display_name + ' Population ' + prevalence_text + ' as of ' + date,
'cases_to_pop': 'Ratio of ' + group_display_name + ' ' + metric + ' Share to Population Share' + ' as of ' + date,
'percent_known_cases': 'Percent of CDC ' + metric + ' with Known Race/Ethnicity' + ' as of ' + date,
'percent_known_or_na_cases': 'Percent of CDC ' + metric + ' with Known or Suppressed Race/Ethnicity' + ' as of ' + date,
}
filtered_chart_df = chart_df
if group and filter_data:
#filtered_chart_df = filtered_chart_df[filtered_chart_df[group_names[group]['pop_percent']] > MIN_POP_PERCENT]
filtered_chart_df = filtered_chart_df[filtered_chart_df[group_names[group]['pop']] > MIN_POP]
filtered_chart_df = filtered_chart_df[filtered_chart_df['percent_known_cases'] > MIN_PERCENT_KNOWN]
filtered_chart_df = filtered_chart_df[filtered_chart_df[group_names[group]['cases']] > MIN_CASES]
highlight = alt.selection_single(on='mouseover', fields=['id', 'county_fips'], empty='none')
data_cols = ['state_county',
'percent_known_cases',
'percent_known_or_na_cases',
'total_cases']
if group:
data_cols.extend([
group_names[group]['cases'],
group_names[group]['pop'],
group_names[group]['pop_percent'],
group_names[group]['cases_per_100'],
group_names[group]['cases_percent'],
group_names[group]['cases_percent_with_unknown'],
group_names[group]['cases_to_pop'],
group_names[group]['cases_to_pop_with_unknown'],
])
tooltips = [alt.Tooltip('state_county:N', title='County'),
alt.Tooltip('percent_known_cases:Q', format='.0%', title=metric + ' with race/ethnicity')
]
if chart_col in ('percent_known_cases', 'percent_known_or_na_cases'):
tooltips.extend([
alt.Tooltip('total_cases:Q', format=',.0f', title=metric)
])
if chart_col == 'percent_known_or_na_cases':
tooltips.extend([
alt.Tooltip('percent_known_or_na_cases:Q', format='.0%',
title=metric + ' with known or suppressed race/ethnicity')
])
if group:
tooltips.extend([
alt.Tooltip(group_names[group]['cases'] + ':Q', format=',',
title=group_short_name + ' ' + metric.lower()),
])
if chart_col == 'cases_per_100':
tooltips.extend([
alt.Tooltip(group_names[group]['pop'] + ':Q', format=',',
title=group_short_name + ' population'),
alt.Tooltip(group_names[group]['cases_per_100'] + ':Q', format='.2%',
title='Percent ' + prevalence_text)
])
elif chart_col == 'cases_to_pop':
tooltips.extend([
alt.Tooltip(group_names[group]['cases_percent_with_unknown'] + ':Q', format='.1%',
title='Percent of total ' + metric.lower()),
alt.Tooltip(group_names[group]['cases_percent'] + ':Q', format='.1%',
title='Percent of known race/ethnicity ' + metric.lower()),
alt.Tooltip(group_names[group]['pop_percent'] + ':Q', format='.1%',
title=group_short_name + ' percent of population'),
alt.Tooltip(group_names[group]['cases_to_pop'] + ':Q', format='.2f',
title='Ratio of percent of known ' + metric.lower() + ' to percent of population'),
#alt.Tooltip(group_names[group]['cases_to_pop_with_unknown'] + ':Q', format='.2f',
# title='Ratio of ' + metric.lower() + ' to population including unknowns'),
])
reverse_scale = False
if chart_col == 'cases_to_pop':
reverse_scale = True
plot = alt.Chart(us_counties).mark_geoshape(
stroke='white',
strokeOpacity=.2,
strokeWidth=1
).project(
type='albersUsa'
).transform_lookup(
lookup='id',
from_=alt.LookupData(filtered_chart_df, 'county_fips', data_cols)
).encode(
alt.Color(group_chart_col,
type='quantitative',
legend=alt.Legend(format=chart_col_to_legend_format[chart_col]),
scale=alt.Scale(scheme=chart_col_to_color_scheme[chart_col],
reverse=reverse_scale,
domain=chart_col_to_range[chart_col],
clamp=True,
),
title=''),
tooltip=tooltips
).add_selection(
highlight,
)
states_outline = alt.Chart(us_states).mark_geoshape(stroke='white', strokeWidth=1.5, fillOpacity=0, fill='white').project(
type='albersUsa'
)
states_fill = alt.Chart(us_states).mark_geoshape(
fill='silver',
stroke='white'
).project('albersUsa')
layered_map = alt.layer(states_fill, plot, states_outline).properties(
title=col_to_title[chart_col],
)
return layered_map
#@title
group_charts = {'cases_per_100': {}, 'cases_to_pop': {}}
for group in race_ethnicity_groups:
for value in ('cases_per_100', 'cases_to_pop'):
group_charts[value][group] = GenerateCountyMap(
chart_df, value, group, group_names, metric, date_display_name)
Rates of COVID-19 cases in each population group:
#@title
black = group_charts['cases_per_100']['black'].properties(width=450, height=325)
hispanic = group_charts['cases_per_100']['hispanic'].properties(width=450, height=325)
white = group_charts['cases_per_100']['white'].properties(width=450, height=325)
asian = group_charts['cases_per_100']['asian'].properties(width=450, height=325)
aian = group_charts['cases_per_100']['aian'].properties(width=450, height=325)
nhpi = group_charts['cases_per_100']['nhpi'].properties(width=450, height=325)
((black | hispanic) &
(white | asian) &
(aian | nhpi)).configure_legend(
orient='top',
gradientLength=400,
titleLimit=0,
).configure_view(
strokeWidth=0,
).display()
We can also view disparities by comparing the percentage of total cases that a race/ethnicity group accounts for in a county (the cases share) vs. the percentage of the total population that a race/ethnicity accounts for in a county (the population share). There is no disparity when the cases share is equal to the population share for all race/ethnicity groups in a county (ratio = 1.0). When the ratio of cases share to population share is above 1.0, then a group has a disproportionate number of cases relative to its share of the population.
#@title
black = group_charts['cases_to_pop']['black'].properties(width=450, height=325)
hispanic = group_charts['cases_to_pop']['hispanic'].properties(width=450, height=325)
white = group_charts['cases_to_pop']['white'].properties(width=450, height=325)
asian = group_charts['cases_to_pop']['asian'].properties(width=450, height=325)
aian = group_charts['cases_to_pop']['aian'].properties(width=450, height=325)
nhpi = group_charts['cases_to_pop']['nhpi'].properties(width=450, height=325)
((black | hispanic) &
(white | asian) &
(aian | nhpi)).configure_legend(
orient='top',
gradientLength=400,
titleLimit=0,
).configure_view(
strokeWidth=0,
).display()
# Large county-level disparity maps
#@title
for group in ['black', 'hispanic', 'white', 'asian', 'aian', 'nhpi']:
(group_charts['cases_per_100'][group]).properties(
width=900,
height=650,
).configure_legend(
orient='top-right',
gradientLength=400,
titleLimit=0,
padding=0
).configure_view(
strokeWidth=0,
).display()
#@title
for group in ['black', 'hispanic', 'white', 'asian', 'aian', 'nhpi']:
(group_charts['cases_to_pop'][group]).properties(
width=900,
height=650,
).configure_legend(
orient='top-right',
gradientLength=400,
titleLimit=0,
padding=0
).configure_view(
strokeWidth=0,
).display()
%%shell
jupyter nbconvert --to html 'cdc_case_disparities.ipynb' --no-input